#import necessary packages
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from ydata_profiling import ProfileReport
### Let's see the data profiling first
behavior_data = pd.read_csv("Online Behavior Prediction Fake Data.csv")
len(behavior_data)
1171
behavior_data.head()
| User_ID | Pageviews | Time_On_Site | Bounces | Sessions | Form_Submitted | Primary_CTA_clicks | Secondary_CTA_clicks | Tertiary_CTA_clicks | Product_Nav_clicks | Product_Comparison_clicks | Offline_Sale | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | U_001 | 63 | 216 | 0 | 25 | 4 | 3 | 3 | 3 | 3 | 3 | 1 |
| 1 | U_002 | 23 | 27 | 1 | 14 | 0 | 11 | 1 | 1 | 3 | 2 | 0 |
| 2 | U_003 | 39 | 16 | 0 | 17 | 1 | 2 | 1 | 1 | 1 | 1 | 0 |
| 3 | U_004 | 98 | 920 | 1 | 2 | 8 | 45 | 21 | 4 | 5 | 8 | 0 |
| 4 | U_005 | 39 | 46 | 0 | 11 | 0 | 2 | 4 | 0 | 2 | 1 | 0 |
High level summary of the above data:¶
The dataset consists of 1,171 records, each representing a unique user’s online behavior. The data includes various metrics such as pageviews, time spent on the site, sessions, interactions with different calls-to-action (CTAs), and offline sales.
Start data profiling and understand the data¶
# Generate an interactive ydata-profiling report for a first-pass look at
# distributions and missing values. Fixes the "Behaviral" typo in the
# user-visible report title.
data_profile = ProfileReport(behavior_data, title = "Online Behavioral Data and Offline Purchase")
data_profile.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Start machine learning process to predict the offline purchase using online behavior data¶
From the data profiling analysis, there are no missing data points in the dataset. As a result, we can proceed with the train/test split process.
Step 1: select numerical columns and start train test split.¶
# List all column names to pick the numeric feature set in the next cell.
behavior_data.columns
Index(['User_ID', 'Pageviews', 'Time_On_Site', 'Bounces', 'Sessions',
'Form_Submitted', 'Primary_CTA_clicks', 'Secondary_CTA_clicks',
'Tertiary_CTA_clicks', 'Product_Nav_clicks',
'Product_Comparison_clicks', 'Offline_Sale'],
dtype='object')
# Feature matrix: every numeric behavioral column (User_ID excluded — it is
# just an identifier with no predictive meaning).
X = behavior_data[['Pageviews', 'Time_On_Site', 'Bounces', 'Sessions',
       'Form_Submitted', 'Primary_CTA_clicks', 'Secondary_CTA_clicks',
       'Tertiary_CTA_clicks', 'Product_Nav_clicks',
       'Product_Comparison_clicks']]
# Binary target: whether the user made an offline purchase.
Y = behavior_data['Offline_Sale']
# 90/10 split with a fixed random_state for reproducibility.
# NOTE(review): the target is heavily imbalanced (~14% positives per the later
# analysis) — stratify=Y would preserve the class ratio in both splits, but it
# would change the recorded outputs below; confirm before applying.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=111)
Step 2: Start machine learning process to predict the offline purchase using logistic regression and random forest classifier.¶
First, using logistic regression to define a baseline for hyper-parameter tuning.
# Baseline classifier. max_iter raised from the default 100 so lbfgs converges
# on this unscaled feature matrix — the original fit emitted the
# ConvergenceWarning shown below.
logistic_model = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, Y_train)
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
# Predict offline-sale labels for the held-out test set with the baseline model.
baseline_pred_result = logistic_model.predict(X_test)
Step 3: calculating confusion matrix from the baseline model¶
### initialize confusion-matrix counters for the baseline model
TP = 0 #true positive
TN = 0 #true negative
FP = 0 #false positive
FN = 0 #false negative
# Pair each prediction with its true label directly; iterating the Series
# yields its values, avoiding the range(len(...)) / .iloc positional-indexing
# anti-pattern of the original loop.
for pred, actual in zip(baseline_pred_result, Y_test):
    if pred == 1 and actual == 1:
        TP += 1
    elif pred == 0 and actual == 0:
        TN += 1
    elif pred == 1 and actual == 0:
        FP += 1
    else:
        FN += 1
print("The number of True Positives are:", TP)
print("The number of True Negatives are:", TN)
print("The number of False Positives are:", FP)
print("The number of False Negatives are:", FN)
The number of True Positives are: 0 The number of True Negatives are: 105 The number of False Positives are: 1 The number of False Negatives are: 16
Step 4: Calculate standard evaluation metrics: Precision, Recall, F1 score, and Accuracy.¶
# Precision = TP / (TP + FP): of the users predicted to purchase, the fraction
# who actually did.
Baseline_Precision = TP/(TP + FP)
print(Baseline_Precision)
print("Meaning of the metric: Of all the users the model predicted would make a purchase.")
print("Precision is vital when we want to ensure that the users predicted to make a purchase are indeed likely to do so.")
0.0 Meaning of the metric: Of all the users the model predicted would make a purchase. Precision is vital when we want to ensure that the users predicted to make a purchase are indeed likely to do so.
# Recall = TP / (TP + FN): of the users who actually purchased, the fraction
# the model identified.
Baseline_Recall = TP/(TP + FN)
print(Baseline_Recall)
print("Meaning of the metric: Of all the users who actually made a purchase")
print("Recall is crucial if your goal is to identify as many potential purchasers as possible.")
0.0 Meaning of the metric: Of all the users who actually made a purchase Recall is crucial if your goal is to identify as many potential purchasers as possible.
# Compute F1 from the baseline precision/recall instead of hard-coding 0.
# F1 is the harmonic mean 2PR/(P+R); when precision and recall are both 0
# (as they are for this baseline) it is undefined, so guard the 0/0 case
# explicitly and report 0 — the same value the original hard-coded.
if (Baseline_Precision + Baseline_Recall) == 0:
    Baseline_F1 = 0
else:
    Baseline_F1 = 2 * Baseline_Precision * Baseline_Recall / (Baseline_Precision + Baseline_Recall)
print(Baseline_F1)
print("Meaning of the metric: F1 score suggesting that the model is reasonably accurate in predicting purchases without sacrificing too much in either precision or recall.")
print("F1 Score provides a balanced view, helping to evaluate the model's overall performance in contexts where both precision and recall are important.")
0 Meaning of the metric: F1 score suggesting that the model is reasonably accurate in predicting purchases without sacrificing too much in either precision or recall. F1 Score provides a balanced view, helping to evaluate the model's overall performance in contexts where both precision and recall are important.
# Accuracy = (TP + TN) / total: overall fraction of correct predictions.
# With a heavily imbalanced target this can look good while the model is
# useless on the positive class — see the interpretation below.
Baseline_Accuracy = (TP+TN)/(TP+TN+FP+FN)
print(Baseline_Accuracy)
print("Accuracy indicates how well the model perform on predicting both purchases and non-purchases")
0.860655737704918 Accuracy indicates how well the model perform on predicting both purchases and non-purchases
Interpretation of the above results:¶
My baseline model got 86% accuracy, but precision, recall, and F1 score are all 0. This result indicates that the dataset is highly imbalanced: with a large number of negative instances (e.g., 86% of the data is negative and 14% is positive), the model can achieve 86% accuracy simply by predicting the negative class for every instance.
What to do:¶
I will use the SMOTE technique to address the highly imbalanced data problem, and use the random forest algorithm with hyper-parameter tuning to find the best model to accurately predict both purchases and non-purchases.
# Oversample the minority (purchase) class with SMOTE on the TRAINING split
# only, so the test split keeps the real-world class ratio for evaluation.
from imblearn.over_sampling import SMOTE
smote_model = SMOTE(random_state=111)
X_train_balanced, Y_train_balanced = smote_model.fit_resample(X_train, Y_train)
Step 6: Using random forest classifier and hyper-parameter tuning¶
import random
def conf_calculation(pred_result, Y):
    """Return (tp, tn, fp, fn) confusion-matrix counts for binary labels.

    pred_result: sequence of predicted 0/1 labels.
    Y: iterable of true 0/1 labels (works for a pandas Series or a plain list).
    """
    tp = 0 #true positive
    tn = 0 #true negative
    fp = 0 #false positive
    fn = 0 #false negative
    # zip pairs each prediction with its true label; iterating a pandas Series
    # yields its values, so positional .iloc indexing is unnecessary.
    for pred, actual in zip(pred_result, Y):
        if pred == 1 and actual == 1:
            tp += 1
        elif pred == 0 and actual == 0:
            tn += 1
        elif pred == 1 and actual == 0:
            fp += 1
        else:
            fn += 1
    return tp, tn, fp, fn
def f1_calculator(tp, tn, fp, fn):
    """Return (f1, precision, recall, accuracy) from confusion counts.

    Every ratio is guarded against a zero denominator (e.g. a model that never
    predicts the positive class) and reported as 0.0 instead of raising
    ZeroDivisionError — the exact failure the baseline model exhibited, which
    the notebook previously worked around by hard-coding F1 = 0.
    """
    current_precision = tp / (tp + fp) if (tp + fp) else 0.0
    current_recall = tp / (tp + fn) if (tp + fn) else 0.0
    pr_sum = current_precision + current_recall
    current_f1 = 2 * current_precision * current_recall / pr_sum if pr_sum else 0.0
    total = tp + tn + fp + fn
    current_accuracy = (tp + tn) / total if total else 0.0
    return current_f1, current_precision, current_recall, current_accuracy
def rf_model(x_train_data, y_train_data, X_test, Y_test):
    """Random-search hyper-parameter tuning for a RandomForestClassifier.

    Trains 300 random hyper-parameter draws x (2 bootstrap options x 3
    criteria) = 1800 candidate forests, evaluates each on the test split, and
    keeps the candidate with the best F1 score.

    Returns (best_f1, best_model, best_precision, best_recall, best_accuracy);
    best_model is None if no candidate ever achieves a positive F1.
    NOTE(review): the search uses the unseeded `random` module, so results
    vary between runs — seed it for reproducibility.
    """
    current_best_f1_score = 0   # best F1 seen so far in the search
    current_best_precision = 0  # precision of the best-F1 candidate
    current_best_recall = 0     # recall of the best-F1 candidate
    current_best_accuracy = 0   # accuracy of the best-F1 candidate
    current_best_model = None   # fitted estimator of the best-F1 candidate
    # 300 random draws of the numeric hyper-parameters, each crossed with all
    # bootstrap/criterion combinations, to cover the grid as widely as possible
    for _ in range(300):
        ne = random.randint(50, 300)  # n_estimators
        md = random.randint(5, 50)    # max_depth
        msl = random.randint(2, 10)   # min_samples_leaf
        mss = random.randint(2, 10)   # min_samples_split
        for bs in [True, False]:
            for cr in ['gini', 'entropy', 'log_loss']:
                # Renamed from `rf_model` to `candidate`: the original local
                # variable shadowed this function's own name.
                candidate = RandomForestClassifier(
                    n_estimators=ne, max_depth=md, min_samples_leaf=msl,
                    min_samples_split=mss, bootstrap=bs, criterion=cr,
                ).fit(x_train_data, y_train_data)  # train the candidate
                # predict the test labels
                rf_pred = candidate.predict(X_test)
                # calculate current tp, tn, fp, fn
                tp, tn, fp, fn = conf_calculation(rf_pred, Y_test)
                # A candidate that predicts no positives makes precision/recall
                # undefined; skip it rather than crash the whole search.
                try:
                    current_f1, current_precision, current_recall, current_accuracy = f1_calculator(tp, tn, fp, fn)
                except ZeroDivisionError:
                    continue
                # if the current F1 beats the best so far, keep this candidate
                if current_f1 > current_best_f1_score:
                    current_best_f1_score = current_f1
                    current_best_model = candidate
                    current_best_precision = current_precision
                    current_best_recall = current_recall
                    current_best_accuracy = current_accuracy
    return current_best_f1_score, current_best_model, current_best_precision, current_best_recall, current_best_accuracy
# Run the random search on the SMOTE-balanced training data; evaluate on the
# untouched (still-imbalanced) test split.
current_best_f1_score, current_best_model, current_best_precision, current_best_recall, current_best_accuracy = rf_model(X_train_balanced, Y_train_balanced, X_test, Y_test)
current_best_f1_score  # notebook cell output
# NOTE(review): the printed figure is hard-coded from one unseeded run; the
# random search is nondeterministic, so reruns will produce different values.
print("the f1 score has been improved from 0 to 0.269")
print("the model demonstrates balance between precision and recall, reflecting a more meaningful ability to predict positive outcomes (purchases)")
the f1 score has been improved from 0 to 0.269 the model demonstrates balance between precision and recall, reflecting a more meaningful ability to predict positive outcomes (purchases)
current_best_precision  # notebook cell output
# NOTE(review): hard-coded figure from one unseeded run — reruns will vary.
print("the precision score has been improved from 0 to 0.194")
print("it shows that the model is starting to make accurate positive predictions")
the precision score has been improved from 0 to 0.194 it shows that the model is starting to make accurate positive predictions
current_best_recall  # notebook cell output
# NOTE(review): hard-coded figure from one unseeded run — reruns will vary.
print("the recall score has been improved from 0 to 0.4375")
print("as the model is now able to capture a substantial portion of actual purchasers")
the recall score has been improved from 0 to 0.4375 as the model is now able to capture a substantial portion of actual purchasers
Future Work Discussion:¶
Fine-tuning the decision threshold could help achieve a better balance between precision and recall, thereby improving the F1 score. Additionally, exploring more advanced models, such as ensemble methods or deep learning approaches, may offer better predictive power. Regularly re-evaluating the model with new data and considering the integration of additional behavioral metrics could also enhance its accuracy. By pursuing these strategies, the model’s effectiveness in predicting offline sales can be further strengthened, leading to more precise targeting and optimized marketing efforts.